# -*- coding: utf-8 -*-
"""
Created on Tue Dec  7 20:26:29 2021

@author: perlita
@title: project 6
"""

# IMPORTANT NOTE: Run each cell individually and in order.

import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets
import sklearn.model_selection

# Function definitions
# Every function that takes an X parameter assumes the feature matrix X
#   has already been augmented (a column of ones inserted as column 0,
#   as done in the data-preparation cells below).

def sigmoid(u):
    """Return the logistic function 1 / (1 + exp(-u)), elementwise.

    Parameters
    ----------
    u : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy scalar or ndarray
        The logistic function of each entry of ``u``.

    Notes
    -----
    The direct form ``exp(u) / (1 + exp(u))`` evaluates to ``inf / inf``
    (``nan``) once ``exp(u)`` overflows.  Here the function is written as
    ``exp(-log(1 + exp(-u)))`` and ``np.logaddexp`` evaluates the logarithm,
    so large positive inputs give 1.0 and large negative inputs give 0.0.
    """
    # log(1 + exp(-u)) == logaddexp(0, -u), computed without overflow.
    return np.exp(-np.logaddexp(0.0, np.negative(u)))

def cross_entropy(p, q, eps=1e-12):
    """Return the binary cross-entropy ``-p*log(q) - (1 - p)*log(1 - q)``.

    Parameters
    ----------
    p : scalar or ndarray
        Target value(s).
    q : scalar or ndarray
        Predicted probability value(s).
    eps : float, optional
        ``q`` is clipped to ``[eps, 1 - eps]`` before the logarithms are
        taken.  Without the clip, ``q == 0`` or ``q == 1`` produces
        ``0 * log(0) = nan`` (or ``inf``), which then poisons any sum the
        result is accumulated into.

    Returns
    -------
    numpy scalar or ndarray
        Elementwise cross-entropy; finite for every ``q`` in ``[0, 1]``.
    """
    q = np.clip(q, eps, 1.0 - eps)
    return -p*np.log(q) - (1 - p)*np.log(1 - q)

def eval_L(beta, X, y, gamma):
    """Return the regularized logistic-regression objective at ``beta``.

    The value is the sum over all rows of the binary cross-entropy between
    ``y[i]`` and ``sigmoid(X[i] . beta)``, plus ``(gamma/2) * ||beta||^2``.

    Parameters
    ----------
    beta : ndarray
        Coefficient vector, one entry per column of ``X``.
    X : ndarray
        Augmented feature matrix, one sample per row.
    y : ndarray
        Target value for each row of ``X``.
    gamma : float
        Weight of the squared-norm penalty.

    Returns
    -------
    numpy.float64
        Objective value.

    Notes
    -----
    With ``z = X @ beta`` the per-sample cross-entropy
    ``-y*log(s(z)) - (1 - y)*log(1 - s(z))`` simplifies to
    ``log(1 + exp(z)) - y*z``.  Evaluating that form with ``np.logaddexp``
    replaces the per-row Python loop with one vectorized expression and
    never takes ``log(0)``, so the result stays finite for large ``|z|``.

    NOTE(review): the data term here is a *sum* over samples, whereas
    ``grad_L`` scales its data term by ``1/N``.  The scaling is left as it
    was so previously recorded objective values remain comparable; confirm
    which convention is intended.
    """
    z = np.ravel(X @ beta)
    targets = np.ravel(y)

    # log(1 + exp(z)) - y*z is the cross-entropy of each sample.
    data_term = np.sum(np.logaddexp(0.0, z) - targets*z)

    reg = (gamma/2)*(np.linalg.norm(beta)**2)
    return data_term + reg

def grad_L(beta, X, y, gamma):
    """Return ``(1/N) * sum_i (sigmoid(X[i] . beta) - y[i]) * X[i] + gamma*beta``.

    Parameters
    ----------
    beta : ndarray
        Coefficient vector, one entry per column of ``X``.
    X : ndarray
        Augmented feature matrix with ``N`` rows (one sample per row).
    y : ndarray
        Target value for each row of ``X``.
    gamma : float
        Weight of the regularization term.

    Returns
    -------
    ndarray
        Vector with one entry per column of ``X``.

    Notes
    -----
    The sum over samples is evaluated as the single product
    ``X.T @ residuals`` instead of a Python loop over rows, matching the
    vectorized style already used by ``Hessian_L``.
    """
    N = X.shape[0]
    scale = 1/N

    # residuals[i] = predicted probability of row i minus its target.
    residuals = sigmoid(np.ravel(X @ beta)) - np.ravel(y)

    grad = scale*(X.T @ residuals)
    reg = gamma*beta
    return grad + reg

def Hessian_L(beta, X, y, gamma):
    """Return ``(1/N) * X.T @ diag(s - s**2) @ X + gamma * I``.

    Here ``s = sigmoid(X @ beta)`` and ``N`` is the number of rows of ``X``.

    Parameters
    ----------
    beta : ndarray
        Coefficient vector, one entry per column of ``X``.
    X : ndarray
        Augmented feature matrix, one sample per row.
    y : ndarray
        Not used by the computation; present so the signature matches
        ``eval_L`` and ``grad_L``.
    gamma : float
        Multiplier of the identity matrix added to the result.

    Returns
    -------
    ndarray
        Square matrix whose side equals the number of columns of ``X``.
    """
    n_rows = X.shape[0]
    n_cols = X.shape[1]

    # Column vector of per-sample probabilities so it broadcasts across X.
    probs = np.reshape(sigmoid(X@beta), (n_rows, 1))

    # Scaling row i of X by s_i - s_i**2 is diag(s - s**2) @ X without
    # building the N-by-N diagonal matrix.
    weighted_rows = (probs - probs**2) * X
    data_part = (1/n_rows)*(X.T@weighted_rows)

    return data_part + gamma*np.identity(n_cols)

#%%

# Logistic regression on the breast cancer dataset: data preparation

# Pull the feature matrix and target vector out of the scikit-learn bundle
dataset = sk.datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

# Keep 80% of the samples for training and the rest for validation.
# No random_state is passed, so the split may differ from run to run.
X_train, X_val, y_train, y_val = sk.model_selection.train_test_split(
    X, y, train_size = .8)

# Standardize every feature with the TRAINING mean and standard deviation;
# the validation set reuses those same statistics.
mu = X_train.mean(axis = 0)
s = X_train.std(axis = 0)
X_train = (X_train - mu)/s
X_val = (X_val - mu)/s

# Augment both matrices by prepending a column of ones
X_train = np.column_stack((np.ones(X_train.shape[0]), X_train))
X_val = np.column_stack((np.ones(X_val.shape[0]), X_val))

#%%

# Gradient descent on the breast cancer dataset

# Settings: number of iterations, fixed step size, regularization weight
max_iter = 500
t = 0.1
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_grad = []

for _ in range(max_iter):
    # Record the objective at the current iterate, then take one step
    L_vals_grad.append(eval_L(betak, X_train, y_train, gamma))
    betak = betak - t*grad_L(betak, X_train, y_train, gamma)

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Gradient Descent Method on the Breast Cancer Dataset: ')
print()
print('Iterations of Gradient Descent used: ', max_iter)
print('Best step size for Gradient Descent: ', t)
print('Accuracy percentage for Gradient Descent: ', accuracy)

# Objective value per iteration on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_grad)
plt.title('Gradient Descent Method for the Breast Cancer Dataset')

#%%

# Newton's method on the breast cancer dataset

# Settings: number of iterations, step size, regularization weight
max_iter = 500
t = 0.1
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_newton = []

for _ in range(max_iter):
    # Record the objective at the current iterate
    L_vals_newton.append(eval_L(betak, X_train, y_train, gamma))

    # Solve H * direction = grad rather than forming the inverse of H,
    # then move a fraction t along that direction
    grad = grad_L(betak, X_train, y_train, gamma)
    H = Hessian_L(betak, X_train, y_train, gamma)
    newton_direction = np.linalg.solve(H, grad)
    betak = betak - t*newton_direction

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Newton\'s Method on the Breast Cancer Dataset:')
print()
print('Iterations of Newton\'s Method used: ', max_iter)
print('Best step size for Newton\'s Method: ', t)
print('Accuracy percentage for Newton\'s Method: ', accuracy)

# Objective value per iteration on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_newton)
plt.title('Newton\'s Method for the Breast Cancer Dataset')

#%%

# Stochastic gradient descent on the breast cancer dataset

# Settings: number of passes over the data, step size, regularization weight
num_epochs = 500
t = 0.001
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_stochastic = []
N = X_train.shape[0]

for _ in range(num_epochs):
    # Record the full objective once per epoch, before any update
    L_vals_stochastic.append(eval_L(betak, X_train, y_train, gamma))

    # Visit every training sample exactly once, in a fresh random order
    for idx in np.random.permutation(N):
        sample = X_train[idx]
        target = y_train[idx]

        # Single-sample data gradient plus the regularization term
        # scaled by 1/N
        step_direction = (sigmoid(sample@betak) - target)*sample \
            + (gamma/N)*betak
        betak = betak - t*step_direction

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Stochastic Gradient Descent Method on the Breast Cancer Dataset')
print()
print('Number of epochs of SGD used: ', num_epochs)
print('Best step size for SGD: ', t)
print('Accuracy percentage for SGD: ', accuracy)

# Objective value per epoch on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_stochastic)
plt.title('SGD Method for the Breast Cancer Dataset')

#%%

# Overlay the objective histories of all three methods for the breast
# cancer dataset on one log-scaled plot
for history, method_label in (
        (L_vals_grad, 'Gradient Descent'),
        (L_vals_newton, 'Newton\'s Method'),
        (L_vals_stochastic, 'Stochastic Method')):
    plt.semilogy(history, label = method_label)

plt.title('Objective Function Value vs Iteration/Epoch for Breast Cancer Dataset')
plt.legend()

# Final comments for breast cancer dataset:
#   Overall, Newton's method seems to perform the best out of the three.
#   Newton's method converges much faster than the other two,
#   outperforming them by a significant amount.
#   But despite how fast Newton's method converges, all three reach
#   roughly the same accuracy percentage. Thus, accuracy does not
#   separate the methods here. Instead, their performance and running
#   time are what matter the most in this case.

#%%

# Logistic regression on the MNIST dataset: data preparation

# Fetch MNIST from OpenML as arrays (as_frame = False).
# NOTE(review): data_home is a hard-coded, user-specific directory; this
# cell will not find or store the data there on another machine.
MNIST_dataset = sk.datasets.fetch_openml(
    'mnist_784',
    as_frame = False,
    data_home = '/Users/perli/OneDrive/Documents/MATH 375/projects/project06')
X = MNIST_dataset.data
labels = MNIST_dataset.target

# Binary target: 0.0 where the label is the string '0', 1.0 everywhere else
N = len(labels)
y = np.array([1.0 if label != '0' else 0.0 for label in labels])

# The first N_train samples are used for training, the rest for validation
N_train = 5000
X_train, X_val = X[:N_train], X[N_train:]
y_train, y_val = y[:N_train], y[N_train:]

# Rescale the features by a constant factor of 1/255
# (presumably pixel intensities in 0..255 - confirm against the dataset)
X_train = X_train/255.0
X_val = X_val/255.0

# Augment both matrices by prepending a column of ones
X_train = np.column_stack((np.ones(X_train.shape[0]), X_train))
X_val = np.column_stack((np.ones(X_val.shape[0]), X_val))

#%%

# Gradient descent on the MNIST dataset

# Settings: number of iterations, fixed step size, regularization weight
max_iter = 500
t = 0.1
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_grad = []

for _ in range(max_iter):
    # Record the objective at the current iterate, then take one step
    L_vals_grad.append(eval_L(betak, X_train, y_train, gamma))
    betak = betak - t*grad_L(betak, X_train, y_train, gamma)

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Gradient Descent Method on the MNIST Dataset: ')
print()
print('Iterations of Gradient Descent used: ', max_iter)
print('Best step size for Gradient Descent: ', t)
print('Accuracy percentage for Gradient Descent: ', accuracy)

# Objective value per iteration on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_grad)
plt.title('Gradient Descent Method for the MNIST Dataset')

#%%

# Newton's method on the MNIST dataset

# Settings: number of iterations, step size, regularization weight
max_iter = 500
t = 0.1
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_newton = []

for _ in range(max_iter):
    # Record the objective at the current iterate
    L_vals_newton.append(eval_L(betak, X_train, y_train, gamma))

    # Solve H * direction = grad rather than forming the inverse of H,
    # then move a fraction t along that direction
    grad = grad_L(betak, X_train, y_train, gamma)
    H = Hessian_L(betak, X_train, y_train, gamma)
    newton_direction = np.linalg.solve(H, grad)
    betak = betak - t*newton_direction

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Newton\'s Method on the MNIST Dataset: ')
print()
print('Iterations of Newton\'s Method used: ', max_iter)
print('Best step size for Newton\'s Method: ', t)
print('Accuracy percentage for Newton\'s Method: ', accuracy)

# Objective value per iteration on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_newton)
plt.title('Newton\'s Method for the MNIST Dataset')

#%%

# Stochastic gradient descent on the MNIST dataset

# Settings: number of passes over the data, step size, regularization weight
num_epochs = 500
t = 0.001
gamma = 0.001

# Start from the zero vector, one coefficient per augmented column
d = X_train.shape[1] - 1
betak = np.zeros(d+1)
L_vals_stochastic = []
N = X_train.shape[0]

for _ in range(num_epochs):
    # Record the full objective once per epoch, before any update
    L_vals_stochastic.append(eval_L(betak, X_train, y_train, gamma))

    # Visit every training sample exactly once, in a fresh random order
    for idx in np.random.permutation(N):
        sample = X_train[idx]
        target = y_train[idx]

        # Single-sample data gradient plus the regularization term
        # scaled by 1/N
        step_direction = (sigmoid(sample@betak) - target)*sample \
            + (gamma/N)*betak
        betak = betak - t*step_direction

# Predicted probabilities on the validation set, thresholded at 0.5:
# below 0.5 -> class 0, otherwise class 1
predictions = sigmoid(X_val @ betak)
augmented_predictions = np.where(predictions < 0.5, 0.0, 1.0)

# Fraction of validation samples whose predicted class equals the target
accuracy = np.mean(augmented_predictions == y_val)

print('Stochastic Gradient Descent Method on the MNIST Dataset: ')
print()
print('Number of epochs of SGD used: ', num_epochs)
print('Best step size for SGD: ', t)
print('Accuracy percentage for SGD: ', accuracy)

# Objective value per epoch on a log-scaled y axis (sanity check)
plt.semilogy(L_vals_stochastic)
plt.title('SGD Method for the MNIST Dataset')

#%%

# Overlay the objective histories of all three methods for the MNIST
# dataset on one log-scaled plot
for history, method_label in (
        (L_vals_grad, 'Gradient Descent'),
        (L_vals_newton, 'Newton\'s Method'),
        (L_vals_stochastic, 'Stochastic Method')):
    plt.semilogy(history, label = method_label)

plt.title('Objective Function Value vs Iteration/Epoch for MNIST Dataset')
plt.legend()

# Final comments for MNIST dataset:
#   Overall, Newton's method performed the best out of the three, both
#   in terms of converging much faster and having the highest accuracy
#   percentage. Thus, Newton's method outperformed the other two
#   methods in all aspects. I believe the reason for this has to do
#   with how much more data MNIST contains compared to the breast
#   cancer dataset. The more data there is, the more beneficial it is
#   to use either Newton's method or perhaps even the stochastic
#   method in order to save space and time on such massive datasets.
